from google.colab import drive
drive.mount('/content/drive/')
import sys
sys.path.append('drive/MyDrive/Colab_Notebooks/')
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
# Importing the testing packages
from configs import validateReadCSV as _validateReadCSV
from configs import validateSubsetDataframe as _validateSubsetDataframe
from configs import validateStandardNormalization as _validateStandardNormalization
from configs import validateKmeansClustering as _validateKmeansClustering
from configs import validateCalculateWCSS as _validateCalculateWCSS
The first exercise is to load a dataset using pandas. An example dataset, downloaded from Kaggle as a CSV file, is provided. Note that pandas has a .read_csv method that reads CSV files directly into a pandas DataFrame.
The code below is provided for reading the CSV file from the current directory into the notebook.
# Note that you can use pandas also for reading in tsv, xlsx, csv, numpy arrays.
# Read in the dataset- This should be a line of code for reading in the csv file from the current directory
# The same code can be utilized to read in multiple files from their .csv format.
def grabDataFromCSV(URL = None):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    URL : str or file-like, optional
        Path, URL, or open file object of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    loaded_frame = pd.read_csv(URL)
    return loaded_frame
# Load the example living-wage dataset from the mounted Google Drive path.
dataframe = grabDataFromCSV("drive/MyDrive/Colab_Notebooks/livingwage_.csv")
#"drive/MyDrive/Colab_Notebooks/livingwage_.csv"
# Run the provided unit test against the student implementation.
_validateReadCSV(grabDataFromCSV)
Select one non-numerical feature as well as at least 3 numerical features from the dataframe.
This can easily be done by sub-setting the dataframe so that the first column contains the non-numerical feature and the subsequent columns contain a select few numerical features (at least 3).
def subsetDataframe(dataframe, numNumericalFeatures =10):
    """Subset ``dataframe`` to one non-numerical feature (first column)
    followed by up to ``numNumericalFeatures`` numerical features.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source frame containing both numerical and non-numerical columns.
    numNumericalFeatures : int, optional
        Maximum number of numerical columns to keep (default 10).

    Returns
    -------
    pandas.DataFrame
        ``dataframe`` restricted to the selected columns.
    """
    # Columns reported by .describe() are the numerical ones.
    numerical_set = set(dataframe.describe().columns)
    # Alphabetically first non-numerical column serves as the identifier.
    non_numerical = sorted(col for col in dataframe.columns if col not in numerical_set)
    non_numerical_ = non_numerical[:1]
    # BUGFIX: iterate dataframe.columns (stable, original order) instead of
    # slicing a set, whose iteration order varies with hash randomization —
    # the original could select different numerical columns on each run.
    numerical_ = [col for col in dataframe.columns if col in numerical_set][:numNumericalFeatures]
    return dataframe[non_numerical_ + numerical_]
# Build the working subset and preview the first 10 rows.
dataframe_ = subsetDataframe(dataframe)
dataframe_.head(10)
# Run the provided unit test against the student implementation.
_validateSubsetDataframe(subsetDataframe)
K-means clustering works best when all the features are scaled appropriately to the same dimension. This is called feature scaling. The different ways to do feature scaling are:
The code snippet below, which we encourage students to utilize, uses standardization as the feature scaling methodology. The formula is $$x' = \frac{x - \mu}{\sigma}$$ where $\mu$ is the feature mean and $\sigma$ is its standard deviation. Students can also try any of the other scaling techniques as an additional exercise and evaluate how that scaling technique impacts the clustering approach.
# create a new normalized column - This should be for Standard Normalization
# To ensure that the code passes ensure that the first column contains non-numerical feature (for identification purposes)
def standardNormalization(dataframe_new):
    """Standard-normalize every numerical column of ``dataframe_new`` in place.

    For each numerical column ``c`` a new column ``c + '_norm'`` is added,
    holding ``(c - mean(c)) / std(c)`` (pandas' sample std, ddof=1).

    Parameters
    ----------
    dataframe_new : pandas.DataFrame
        Frame to normalize; it is mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, now carrying the additional ``*_norm`` columns.
    """
    # .describe() lists only the numerical columns.
    numerical = list(dataframe_new.describe().columns)
    for norm_axis in numerical:
        column = dataframe_new[norm_axis]
        dataframe_new[norm_axis + '_norm'] = (column - column.mean()) / column.std()
    # BUGFIX: return the frame as the stated contract requires — the original
    # returned None and contained a dead ``head(20)`` call whose result was
    # discarded. Callers that ignore the return value are unaffected.
    return dataframe_new
# Add *_norm columns to the working frame (mutates dataframe_ in place).
standardNormalization(dataframe_)
# Run the provided unit test against the student implementation.
_validateStandardNormalization(standardNormalization)
Prior to running k-means clustering on a multidimensional feature dataset, especially one with more than 2 features, we can visualize the data to see how any 2 features correlate with one another after scaling.
The code snippet below is tailored to the living-wage dataset. The details will differ for other datasets, but the methodology is the same.
def makePlots(kmeansFrame,y=None):
    """Scatter-plot every ordered pair of standardized ('_norm') numerical
    features in ``kmeansFrame`` against each other.

    There is no unit test for this plotting function.

    Parameters
    ----------
    kmeansFrame : pandas.DataFrame
        Frame whose '*_norm' numerical columns are plotted pairwise.
    y : array-like, optional
        Cluster labels used to colour the points.
    """
    # Restrict to standardized numerical columns; this significantly
    # reduces the number of plots that have to be made.
    norm_columns = [name for name in list(kmeansFrame.describe().columns) if '_norm' in name]
    for x_axis in norm_columns:
        for y_axis in norm_columns:
            # Skip the diagonal (a feature plotted against itself).
            if x_axis == y_axis:
                continue
            fig, ax = plt.subplots(figsize=(7,7))
            fig = plt.scatter(np.array(kmeansFrame[x_axis]), kmeansFrame[y_axis],c=y, s=50, cmap='viridis')
            plt.xlabel(x_axis)
            plt.ylabel(y_axis)
    return None
makePlots(dataframe_)
The k-means clustering algorithm can be written either using numpy or using the sklearn library. The sklearn library provides a simple implementation using a few lines of code.
The initial exercise that students work on involves a few lines of code that perform clustering using the sklearn library.
def kmeansClustering(num_clusters,kmeansDataFrame,random_state=42):
    """Cluster the standardized features of ``kmeansDataFrame`` with k-means.

    Parameters
    ----------
    num_clusters : int
        Number of clusters to fit.
    kmeansDataFrame : pandas.DataFrame
        Frame containing '*_norm' standardized numerical columns.
    random_state : int, optional
        Seed forwarded to sklearn's KMeans (default 42).

    Returns
    -------
    tuple
        (cluster label for each row, fitted KMeans estimator).
    """
    # Keep only the standardized numerical columns.
    norm_columns = [name for name in list(kmeansDataFrame.describe().columns) if '_norm' in name]
    features = kmeansDataFrame[norm_columns]
    # Fit the model, then assign each row to its nearest centroid.
    kmeans = KMeans(n_clusters=num_clusters, random_state=random_state)
    kmeans.fit(features)
    labels = kmeans.predict(features)
    return labels, kmeans
# Cluster the normalized features into 8 groups; keep only the labels.
y_Kmeans,_ = kmeansClustering(8,dataframe_)
# Run the provided unit test against the student implementation.
_validateKmeansClustering(kmeansClustering)
We can make the plot using the makePlots function above and pass in the cluster values predicted by the kmeansClustering algorithm.
makePlots(dataframe_,y_Kmeans)
Write code to calculate the within-cluster sum of squares (WCSS). Recall that the WCSS is a measure of the average squared distance of all points within a cluster to that cluster's centroid.
Unlike the WCSS, the between-cluster sum of squares (BCSS) measures the average squared distance between the cluster centroids.
def calculateWCSS(num_clusters, kmeansDataFrame, random_state = 42):
    """Elbow-method helper: fit k-means for k = 1 .. num_clusters-1 and
    collect the within-cluster sum of squares (sklearn's ``inertia_``).

    Parameters
    ----------
    num_clusters : int
        Exclusive upper bound on the number of clusters to evaluate.
    kmeansDataFrame : pandas.DataFrame
        Frame whose numerical columns are clustered.
    random_state : int, optional
        Seed forwarded to KMeans (default 42).

    Returns
    -------
    list of float
        WCSS for each evaluated cluster count, in increasing k order.
    """
    # NOTE(review): all numerical columns are used here, not only the
    # '_norm' ones as in kmeansClustering — confirm this is intended.
    numeric_frame = kmeansDataFrame[list(kmeansDataFrame.describe().columns)]
    wcss = []
    for k in range(1,num_clusters):
        model = KMeans(n_clusters=k, random_state= random_state)
        model.fit(numeric_frame)
        wcss.append(model.inertia_)
    # Elbow plot: look for the k where the curve flattens out.
    plt.plot(np.arange(1,num_clusters),wcss)
    plt.xlabel('Clusters')
    plt.ylabel('SSE')
    plt.show()
    return wcss
# Evaluate WCSS for k = 1..7 and draw the elbow plot.
calculateWCSS(8,dataframe_)
# Run the provided unit test against the student implementation.
_validateCalculateWCSS(calculateWCSS)
THIS IS A CHALLENGE EXERCISE!!
The goal of this exercise is to pick any 2 dimensions in the feature dataset and show the center of each cluster, as well as a cluster radius, using matplotlib.
A better approach is to use Principal Component Analysis (PCA) to reduce the dimensionality down to 2 and then cluster on those 2 dimensions.
Note that this is a challenge exercise — students are encouraged to attempt it, but there is no penalty for skipping it.
from sklearn.decomposition import PCA

# Hint #1 - Select the normalized columns and convert them into a numpy array.
newarray = dataframe_[[colName for colName in list(dataframe_.describe().columns) if '_norm' in colName]].to_numpy()
# Hint #2 - PCA down to 2 components to make visualization easier.
pcal = PCA(n_components=2)
val = pcal.fit_transform(newarray)
# Hint #3 - K-means clustering on the 2 principal components.
means = KMeans(n_clusters=3)
means.fit(val)
y_means = means.predict(val)
# Hint #4 - Scatter plot of the two principal components, coloured by cluster.
fig, ax = plt.subplots(figsize=(8, 8))
fig = plt.scatter(val[:, 0], val[:, 1], c=y_means, s=50, cmap='viridis')
centers = means.cluster_centers_
label1 = means.labels_
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=100, alpha=0.5)
plt.xlabel('principal X component')
plt.ylabel('principal Y component')
# Hint #5 - Radius of each cluster: the largest Euclidean distance from any
# member point to its cluster centre (vectorized with numpy instead of the
# original nested Python loops).
YMax = []
ValueLabels = []
for cluster_id in np.unique(label1):
    member_points = val[label1 == cluster_id]
    distances = np.linalg.norm(member_points - centers[cluster_id], axis=1)
    YMax.append(float(distances.max()))
    ValueLabels.append(cluster_id)
print(YMax)
# Hint #6 - Draw one enclosing circle per cluster. GENERALIZED: the original
# hard-coded exactly three circle objects, breaking if n_clusters changed;
# this loop works for any number of clusters.
for center, radius in zip(centers, YMax):
    circle = plt.Circle((center[0], center[1]), radius, color='blue', fill=False)
    ax.add_artist(circle)